loki/pkg/bloomgateway/worker.go

package bloomgateway

import (
	"context"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"go.uber.org/atomic"

	"github.com/grafana/loki/v3/pkg/queue"
	"github.com/grafana/loki/v3/pkg/storage/stores/shipper/bloomshipper"
)
const (
	labelSuccess = "success"
	labelFailure = "failure"
)

type workerConfig struct {
	maxItems         int
	queryConcurrency int
}
// worker is a data structure that consumes tasks from the request queue,
// processes them, and returns the result/error back to the response channels
// of the tasks.
// It is responsible for multiplexing tasks so they can be processed in a more
// efficient way.
type worker struct {
	services.Service

	id      string
	cfg     workerConfig
	queue   *queue.RequestQueue
	store   bloomshipper.Store
	pending *atomic.Int64
	logger  log.Logger
	metrics *workerMetrics
}
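
// Data-flow sketch (illustrative, not part of the original file): the enqueue
// side hands a Task to the shared request queue and waits on the task's
// response channel, which the worker writes to after processing. The names
// below (q, newTask, resCh) and the Enqueue signature are hypothetical.
//
//	task := newTask(tenantID, req) // hypothetical constructor
//	_ = q.Enqueue(tenantID, task)  // hypothetical signature; a worker dequeues it
//	result := <-task.resCh         // hypothetical response channel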
func newWorker(id string, cfg workerConfig, queue *queue.RequestQueue, store bloomshipper.Store, pending *atomic.Int64, logger log.Logger, metrics *workerMetrics) *worker {
	w := &worker{
		id:      id,
		cfg:     cfg,
		queue:   queue,
		store:   store,
		pending: pending,
		logger:  log.With(logger, "worker", id),
		metrics: metrics,
	}
	w.Service = services.NewBasicService(w.starting, w.running, w.stopping).WithName(id)
	return w
}
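
// Usage sketch (illustrative, not part of the original file): a gateway would
// typically run a fixed pool of workers sharing a single request queue.
// numWorkers and the worker id format are assumptions for the example;
// services.StartAndAwaitRunning is the dskit helper for starting a service.
//
//	for i := 0; i < numWorkers; i++ {
//		id := fmt.Sprintf("bloom-query-worker-%d", i)
//		w := newWorker(id, cfg, q, store, pending, logger, metrics)
//		if err := services.StartAndAwaitRunning(ctx, w); err != nil {
//			return err
//		}
//	}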
func (w *worker) starting(_ context.Context) error {
	level.Debug(w.logger).Log("msg", "starting worker")
	w.queue.RegisterConsumerConnection(w.id)
	return nil
}
func (w *worker) running(_ context.Context) error {
	idx := queue.StartIndexWithLocalQueue

	p := newProcessor(w.id, w.cfg.queryConcurrency, w.store, w.logger, w.metrics)

	for st := w.State(); st == services.Running || st == services.Stopping; st = w.State() {
		taskCtx := context.Background()
		start := time.Now()
		// Note on DequeueMany (PR #11797): knowing the index of the last used
		// tenant queue alone is not enough to guarantee that all items are
		// dequeued from exactly the same tenant's queue when the queue is
		// consumed aggressively in parallel. DequeueMany handles those edge
		// cases internally and returns the index to resume from.
		items, newIdx, err := w.queue.DequeueMany(taskCtx, idx, w.id, w.cfg.maxItems)
		w.metrics.dequeueDuration.WithLabelValues(w.id).Observe(time.Since(start).Seconds())
		if err != nil {
			// We only return an error if the queue is stopped and dequeuing did not yield any items
			if err == queue.ErrStopped && len(items) == 0 {
				return err
			}
			w.metrics.tasksDequeued.WithLabelValues(w.id, labelFailure).Inc()
			level.Error(w.logger).Log("msg", "failed to dequeue tasks", "err", err, "items", len(items))
		}
		idx = newIdx

		if len(items) == 0 {
			w.queue.ReleaseRequests(items)
			continue
		}
		w.metrics.tasksDequeued.WithLabelValues(w.id, labelSuccess).Add(float64(len(items)))

		tasks := make([]Task, 0, len(items))
		for _, item := range items {
			task, ok := item.(Task)
			if !ok {
				// This really should never happen, because only the bloom gateway itself can enqueue tasks.
				w.queue.ReleaseRequests(items)
				return errors.Errorf("failed to cast dequeued item to Task: %v", item)
			}
			_ = w.pending.Dec()
			w.metrics.queueDuration.WithLabelValues(w.id).Observe(time.Since(task.enqueueTime).Seconds())
			FromContext(task.ctx).AddQueueTime(time.Since(task.enqueueTime))
			tasks = append(tasks, task)
		}

		start = time.Now()
		err = p.processTasks(taskCtx, tasks)
		if err != nil {
			w.metrics.processDuration.WithLabelValues(w.id, labelFailure).Observe(time.Since(start).Seconds())
			w.metrics.tasksProcessed.WithLabelValues(w.id, labelFailure).Add(float64(len(tasks)))
			level.Error(w.logger).Log("msg", "failed to process tasks", "err", err)
		} else {
			w.metrics.processDuration.WithLabelValues(w.id, labelSuccess).Observe(time.Since(start).Seconds())
			w.metrics.tasksProcessed.WithLabelValues(w.id, labelSuccess).Add(float64(len(tasks)))
		}

		// return dequeued items back to the pool
		w.queue.ReleaseRequests(items)
	}
	return nil
}
func (w *worker) stopping(err error) error {
	level.Debug(w.logger).Log("msg", "stopping worker", "err", err)
	w.queue.UnregisterConsumerConnection(w.id)
	return nil
}
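
// Shutdown sketch (illustrative, not part of the original file): stopping a
// worker triggers stopping(), which unregisters the worker's queue connection
// so no further tasks are assigned to it. services.StopAndAwaitTerminated is
// the dskit helper for stopping a service.
//
//	if err := services.StopAndAwaitTerminated(ctx, w); err != nil {
//		level.Error(logger).Log("msg", "failed to stop worker", "err", err)
//	}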