Like Prometheus, but for logs.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
loki/pkg/querier/worker/scheduler_processor.go

234 lines
8.1 KiB

package worker
import (
"context"
"fmt"
"net/http"
"strings"
"time"
"github.com/cortexproject/cortex/pkg/frontend/v2/frontendv2pb"
querier_stats "github.com/cortexproject/cortex/pkg/querier/stats"
"github.com/cortexproject/cortex/pkg/ring/client"
"github.com/cortexproject/cortex/pkg/scheduler/schedulerpb"
util_log "github.com/cortexproject/cortex/pkg/util/log"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/backoff"
"github.com/grafana/dskit/grpcclient"
dskit_middleware "github.com/grafana/dskit/middleware"
"github.com/grafana/dskit/services"
otgrpc "github.com/opentracing-contrib/go-grpc"
"github.com/opentracing/opentracing-go"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/weaveworks/common/httpgrpc"
"github.com/weaveworks/common/middleware"
"github.com/weaveworks/common/user"
"google.golang.org/grpc"
"google.golang.org/grpc/health/grpc_health_v1"
lokigrpc "github.com/grafana/loki/pkg/util/httpgrpc"
)
func newSchedulerProcessor(cfg Config, handler RequestHandler, log log.Logger, reg prometheus.Registerer) (*schedulerProcessor, []services.Service) {
p := &schedulerProcessor{
log: log,
handler: handler,
maxMessageSize: cfg.GRPCClientConfig.MaxSendMsgSize,
querierID: cfg.QuerierID,
grpcConfig: cfg.GRPCClientConfig,
frontendClientRequestDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Name: "cortex_querier_query_frontend_request_duration_seconds",
Help: "Time spend doing requests to frontend.",
Buckets: prometheus.ExponentialBuckets(0.001, 4, 6),
}, []string{"operation", "status_code"}),
}
frontendClientsGauge := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Name: "cortex_querier_query_frontend_clients",
Help: "The current number of clients connected to query-frontend.",
})
poolConfig := client.PoolConfig{
CheckInterval: 5 * time.Second,
HealthCheckEnabled: true,
HealthCheckTimeout: 1 * time.Second,
}
p.frontendPool = client.NewPool("frontend", poolConfig, nil, p.createFrontendClient, frontendClientsGauge, log)
return p, []services.Service{p.frontendPool}
}
// Handles incoming queries from query-scheduler.
type schedulerProcessor struct {
log log.Logger
handler RequestHandler
grpcConfig grpcclient.Config
maxMessageSize int
querierID string
frontendPool *client.Pool
frontendClientRequestDuration *prometheus.HistogramVec
}
// notifyShutdown implements processor.
func (sp *schedulerProcessor) notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string) {
client := schedulerpb.NewSchedulerForQuerierClient(conn)
req := &schedulerpb.NotifyQuerierShutdownRequest{QuerierID: sp.querierID}
if _, err := client.NotifyQuerierShutdown(ctx, req); err != nil {
// Since we're shutting down there's nothing we can do except logging it.
level.Warn(sp.log).Log("msg", "failed to notify querier shutdown to query-scheduler", "address", address, "err", err)
}
}
func (sp *schedulerProcessor) processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string) {
schedulerClient := schedulerpb.NewSchedulerForQuerierClient(conn)
backoff := backoff.New(ctx, processorBackoffConfig)
for backoff.Ongoing() {
c, err := schedulerClient.QuerierLoop(ctx)
if err == nil {
err = c.Send(&schedulerpb.QuerierToScheduler{QuerierID: sp.querierID})
}
if err != nil {
level.Error(sp.log).Log("msg", "error contacting scheduler", "err", err, "addr", address)
backoff.Wait()
continue
}
if err := sp.querierLoop(c, address); err != nil {
// E.Welch I don't know how to do this any better but context cancelations seem common,
// likely because of an underlying connection being close,
// they are noisy and I don't think they communicate anything useful.
if !strings.Contains(err.Error(), "context canceled") {
level.Error(sp.log).Log("msg", "error processing requests from scheduler", "err", err, "addr", address)
}
backoff.Wait()
continue
}
backoff.Reset()
}
}
// process loops processing requests on an established stream.
func (sp *schedulerProcessor) querierLoop(c schedulerpb.SchedulerForQuerier_QuerierLoopClient, address string) error {
// Build a child context so we can cancel a query when the stream is closed.
ctx, cancel := context.WithCancel(c.Context())
defer cancel()
for {
request, err := c.Recv()
if err != nil {
return err
}
// Handle the request on a "background" goroutine, so we go back to
// blocking on c.Recv(). This allows us to detect the stream closing
// and cancel the query. We don't actually handle queries in parallel
// here, as we're running in lock step with the server - each Recv is
// paired with a Send.
go func() {
// We need to inject user into context for sending response back.
ctx := user.InjectOrgID(ctx, request.UserID)
tracer := opentracing.GlobalTracer()
// Ignore errors here. If we cannot get parent span, we just don't create new one.
parentSpanContext, _ := lokigrpc.GetParentSpanForRequest(tracer, request.HttpRequest)
if parentSpanContext != nil {
queueSpan, spanCtx := opentracing.StartSpanFromContextWithTracer(ctx, tracer, "querier_processor_runRequest", opentracing.ChildOf(parentSpanContext))
defer queueSpan.Finish()
ctx = spanCtx
}
logger := util_log.WithContext(ctx, sp.log)
sp.runRequest(ctx, logger, request.QueryID, request.FrontendAddress, request.StatsEnabled, request.HttpRequest)
// Report back to scheduler that processing of the query has finished.
if err := c.Send(&schedulerpb.QuerierToScheduler{}); err != nil {
level.Error(logger).Log("msg", "error notifying scheduler about finished query", "err", err, "addr", address)
}
}()
}
}
func (sp *schedulerProcessor) runRequest(ctx context.Context, logger log.Logger, queryID uint64, frontendAddress string, statsEnabled bool, request *httpgrpc.HTTPRequest) {
var stats *querier_stats.Stats
if statsEnabled {
stats, ctx = querier_stats.ContextWithEmptyStats(ctx)
}
response, err := sp.handler.Handle(ctx, request)
if err != nil {
var ok bool
response, ok = httpgrpc.HTTPResponseFromError(err)
if !ok {
response = &httpgrpc.HTTPResponse{
Code: http.StatusInternalServerError,
Body: []byte(err.Error()),
}
}
}
// Ensure responses that are too big are not retried.
if len(response.Body) >= sp.maxMessageSize {
level.Error(logger).Log("msg", "response larger than max message size", "size", len(response.Body), "maxMessageSize", sp.maxMessageSize)
errMsg := fmt.Sprintf("response larger than the max message size (%d vs %d)", len(response.Body), sp.maxMessageSize)
response = &httpgrpc.HTTPResponse{
Code: http.StatusRequestEntityTooLarge,
Body: []byte(errMsg),
}
}
c, err := sp.frontendPool.GetClientFor(frontendAddress)
if err == nil {
// Response is empty and uninteresting.
_, err = c.(frontendv2pb.FrontendForQuerierClient).QueryResult(ctx, &frontendv2pb.QueryResultRequest{
QueryID: queryID,
HttpResponse: response,
Stats: stats,
})
}
if err != nil {
level.Error(logger).Log("msg", "error notifying frontend about finished query", "err", err, "frontend", frontendAddress)
}
}
func (sp *schedulerProcessor) createFrontendClient(addr string) (client.PoolClient, error) {
opts, err := sp.grpcConfig.DialOption([]grpc.UnaryClientInterceptor{
otgrpc.OpenTracingClientInterceptor(opentracing.GlobalTracer()),
middleware.ClientUserHeaderInterceptor,
dskit_middleware.PrometheusGRPCUnaryInstrumentation(sp.frontendClientRequestDuration),
}, nil)
if err != nil {
return nil, err
}
conn, err := grpc.Dial(addr, opts...)
if err != nil {
return nil, err
}
return &frontendClient{
FrontendForQuerierClient: frontendv2pb.NewFrontendForQuerierClient(conn),
HealthClient: grpc_health_v1.NewHealthClient(conn),
conn: conn,
}, nil
}
type frontendClient struct {
frontendv2pb.FrontendForQuerierClient
grpc_health_v1.HealthClient
conn *grpc.ClientConn
}
func (fc *frontendClient) Close() error {
return fc.conn.Close()
}