mirror of https://github.com/grafana/loki
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
354 lines
9.9 KiB
354 lines
9.9 KiB
// SPDX-License-Identifier: AGPL-3.0-only
|
|
// Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/scheduler/queue/user_queues.go
|
|
// Provenance-includes-license: Apache-2.0
|
|
// Provenance-includes-copyright: The Cortex Authors.
|
|
|
|
package queue
|
|
|
|
import (
|
|
"fmt"
|
|
"math/rand"
|
|
"sort"
|
|
"time"
|
|
|
|
"github.com/go-kit/log/level"
|
|
"github.com/grafana/dskit/tenant"
|
|
|
|
"github.com/grafana/loki/v3/pkg/util"
|
|
util_log "github.com/grafana/loki/v3/pkg/util/log"
|
|
"github.com/grafana/loki/v3/pkg/util/validation"
|
|
)
|
|
|
|
type intPointerMap map[string]*int
|
|
|
|
func (tqs intPointerMap) Inc(key string) int {
|
|
ptr, ok := tqs[key]
|
|
if !ok {
|
|
size := 1
|
|
tqs[key] = &size
|
|
return size
|
|
}
|
|
(*ptr)++
|
|
return *ptr
|
|
}
|
|
|
|
func (tqs intPointerMap) Dec(key string) int {
|
|
ptr, ok := tqs[key]
|
|
if !ok {
|
|
return 0
|
|
}
|
|
(*ptr)--
|
|
if *ptr == 0 {
|
|
delete(tqs, key)
|
|
}
|
|
return *ptr
|
|
}
|
|
|
|
// consumer holds information about a consumer registered in the queue.
|
|
type consumer struct {
|
|
// Number of active connections.
|
|
connections int
|
|
|
|
// True if the consumer notified it's gracefully shutting down.
|
|
shuttingDown bool
|
|
|
|
// When the last connection has been unregistered.
|
|
disconnectedAt time.Time
|
|
}
|
|
|
|
// This struct holds tenant queues for pending requests. It also keeps track of connected consumers,
|
|
// and mapping between tenants and consumers.
|
|
type tenantQueues struct {
|
|
mapping *Mapping[*tenantQueue]
|
|
|
|
maxUserQueueSize int
|
|
perUserQueueLen intPointerMap
|
|
|
|
// How long to wait before removing a consumer which has got disconnected
|
|
// but hasn't notified about a graceful shutdown.
|
|
forgetDelay time.Duration
|
|
|
|
// Tracks consumers registered to the queue.
|
|
consumers map[string]*consumer
|
|
|
|
// sortedConsumer list of consumer IDs, used when creating per-user shard.
|
|
sortedConsumers []string
|
|
|
|
limits Limits
|
|
}
|
|
|
|
type Queue interface {
|
|
Chan() RequestChannel
|
|
Dequeue() Request
|
|
Name() string
|
|
Len() int
|
|
}
|
|
|
|
type Mapable interface {
|
|
*tenantQueue | *TreeQueue
|
|
// https://github.com/golang/go/issues/48522#issuecomment-924348755
|
|
Pos() QueueIndex
|
|
SetPos(index QueueIndex)
|
|
}
|
|
|
|
type tenantQueue struct {
|
|
*TreeQueue
|
|
|
|
// If not nil, only these consumers can handle user requests. If nil, all consumers can.
|
|
// We set this to nil if number of available consumers <= MaxConsumers.
|
|
consumers map[string]struct{}
|
|
|
|
// Seed for shuffle sharding of consumers. This seed is based on userID only and is therefore consistent
|
|
// between different frontends.
|
|
seed int64
|
|
}
|
|
|
|
func newTenantQueues(maxUserQueueSize int, forgetDelay time.Duration, limits Limits) *tenantQueues {
|
|
mm := &Mapping[*tenantQueue]{}
|
|
mm.Init(64)
|
|
return &tenantQueues{
|
|
mapping: mm,
|
|
maxUserQueueSize: maxUserQueueSize,
|
|
perUserQueueLen: make(intPointerMap),
|
|
forgetDelay: forgetDelay,
|
|
consumers: map[string]*consumer{},
|
|
sortedConsumers: nil,
|
|
limits: limits,
|
|
}
|
|
}
|
|
|
|
func (q *tenantQueues) hasNoTenantQueues() bool {
|
|
return q.mapping.Len() == 0
|
|
}
|
|
|
|
func (q *tenantQueues) deleteQueue(tenant string) {
|
|
q.mapping.Remove(tenant)
|
|
}
|
|
|
|
// Returns existing or new queue for a tenant.
|
|
func (q *tenantQueues) getOrAddQueue(tenantID string, path []string) (Queue, error) {
|
|
// Empty tenant is not allowed, as that would break our tenants list ("" is used for free spot).
|
|
if tenantID == "" {
|
|
return nil, fmt.Errorf("empty tenant is not allowed")
|
|
}
|
|
|
|
// extract tenantIDs to compute limits for multi-tenant queries
|
|
tenantIDs, err := tenant.TenantIDsFromOrgID(tenantID)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("extract tenant ids: %w", err)
|
|
}
|
|
|
|
uq := q.mapping.GetByKey(tenantID)
|
|
if uq == nil {
|
|
uq = &tenantQueue{
|
|
seed: util.ShuffleShardSeed(tenantID, ""),
|
|
}
|
|
uq.TreeQueue = newTreeQueue(q.maxUserQueueSize, tenantID)
|
|
q.mapping.Put(tenantID, uq)
|
|
}
|
|
|
|
consumersToSelect := validation.SmallestPositiveNonZeroIntPerTenant(
|
|
tenantIDs,
|
|
func(tenantID string) int {
|
|
return q.limits.MaxConsumers(tenantID, len(q.sortedConsumers))
|
|
},
|
|
)
|
|
|
|
if len(uq.consumers) != consumersToSelect {
|
|
uq.consumers = shuffleConsumersForTenants(uq.seed, consumersToSelect, q.sortedConsumers, nil)
|
|
}
|
|
|
|
if len(path) == 0 {
|
|
return uq, nil
|
|
}
|
|
return uq.add(path), nil
|
|
}
|
|
|
|
// Finds next queue for the consumer. To support fair scheduling between users, client is expected
|
|
// to pass last user index returned by this function as argument. Is there was no previous
|
|
// last user index, use -1.
|
|
func (q *tenantQueues) getNextQueueForConsumer(lastUserIndex QueueIndex, consumerID string) (Queue, string, QueueIndex) {
|
|
uid := lastUserIndex
|
|
|
|
// at the RequestQueue level we don't have local queues, so start index is -1
|
|
if uid == StartIndexWithLocalQueue {
|
|
uid = StartIndex
|
|
}
|
|
|
|
// Ensure the consumer is not shutting down. If the consumer is shutting down, we shouldn't forward
|
|
// any more queries to it.
|
|
if info := q.consumers[consumerID]; info == nil || info.shuttingDown {
|
|
return nil, "", uid
|
|
}
|
|
|
|
maxIters := len(q.mapping.keys) + 1
|
|
for iters := 0; iters < maxIters; iters++ {
|
|
tq, err := q.mapping.GetNext(uid)
|
|
if err == ErrOutOfBounds {
|
|
uid = StartIndex
|
|
continue
|
|
}
|
|
if tq == nil {
|
|
break
|
|
}
|
|
uid = tq.pos
|
|
|
|
if tq.consumers != nil {
|
|
if _, ok := tq.consumers[consumerID]; !ok {
|
|
// This consumer is not handling the user.
|
|
continue
|
|
}
|
|
}
|
|
return tq, tq.name, uid
|
|
}
|
|
|
|
return nil, "", uid
|
|
}
|
|
|
|
func (q *tenantQueues) addConsumerToConnection(consumerID string) {
|
|
info := q.consumers[consumerID]
|
|
if info != nil {
|
|
info.connections++
|
|
|
|
// Reset in case the consumer re-connected while it was in the forget waiting period.
|
|
info.shuttingDown = false
|
|
info.disconnectedAt = time.Time{}
|
|
|
|
return
|
|
}
|
|
|
|
// First connection from this consumer.
|
|
q.consumers[consumerID] = &consumer{connections: 1}
|
|
q.sortedConsumers = append(q.sortedConsumers, consumerID)
|
|
sort.Strings(q.sortedConsumers)
|
|
|
|
q.recomputeUserConsumers()
|
|
}
|
|
|
|
func (q *tenantQueues) removeConsumerConnection(consumerID string, now time.Time) {
|
|
info := q.consumers[consumerID]
|
|
if info == nil || info.connections <= 0 {
|
|
panic("unexpected number of connections for consumer")
|
|
}
|
|
|
|
// Decrease the number of active connections.
|
|
info.connections--
|
|
if info.connections > 0 {
|
|
return
|
|
}
|
|
|
|
// There no more active connections. If the forget delay is configured then
|
|
// we can remove it only if consumer has announced a graceful shutdown.
|
|
if info.shuttingDown || q.forgetDelay == 0 {
|
|
q.removeConsumer(consumerID)
|
|
return
|
|
}
|
|
|
|
// No graceful shutdown has been notified yet, so we should track the current time
|
|
// so that we'll remove the consumer as soon as we receive the graceful shutdown
|
|
// notification (if any) or once the threshold expires.
|
|
info.disconnectedAt = now
|
|
}
|
|
|
|
func (q *tenantQueues) removeConsumer(consumerID string) {
|
|
delete(q.consumers, consumerID)
|
|
|
|
ix := sort.SearchStrings(q.sortedConsumers, consumerID)
|
|
if ix >= len(q.sortedConsumers) || q.sortedConsumers[ix] != consumerID {
|
|
panic("incorrect state of sorted consumers")
|
|
}
|
|
|
|
q.sortedConsumers = append(q.sortedConsumers[:ix], q.sortedConsumers[ix+1:]...)
|
|
|
|
q.recomputeUserConsumers()
|
|
}
|
|
|
|
// notifyQuerierShutdown records that a consumer has sent notification about a graceful shutdown.
|
|
func (q *tenantQueues) notifyQuerierShutdown(consumerID string) {
|
|
info := q.consumers[consumerID]
|
|
if info == nil {
|
|
// The consumer may have already been removed, so we just ignore it.
|
|
return
|
|
}
|
|
|
|
// If there are no more connections, we should remove the consumer.
|
|
if info.connections == 0 {
|
|
q.removeConsumer(consumerID)
|
|
return
|
|
}
|
|
|
|
// Otherwise we should annotate we received a graceful shutdown notification
|
|
// and the consumer will be removed once all connections are unregistered.
|
|
info.shuttingDown = true
|
|
}
|
|
|
|
// forgetDisconnectedConsumers removes all disconnected consumer that have gone since at least
|
|
// the forget delay. Returns the number of forgotten consumers.
|
|
func (q *tenantQueues) forgetDisconnectedConsumers(now time.Time) int {
|
|
// Nothing to do if the forget delay is disabled.
|
|
if q.forgetDelay == 0 {
|
|
return 0
|
|
}
|
|
|
|
// Remove all consumers with no connections that have gone since at least the forget delay.
|
|
threshold := now.Add(-q.forgetDelay)
|
|
forgotten := 0
|
|
|
|
for id := range q.consumers {
|
|
if info := q.consumers[id]; info.connections == 0 && info.disconnectedAt.Before(threshold) {
|
|
q.removeConsumer(id)
|
|
forgotten++
|
|
}
|
|
}
|
|
|
|
return forgotten
|
|
}
|
|
|
|
func (q *tenantQueues) recomputeUserConsumers() {
|
|
scratchpad := make([]string, 0, len(q.sortedConsumers))
|
|
|
|
for _, tenantID := range q.mapping.Keys() {
|
|
if uq := q.mapping.GetByKey(tenantID); uq != nil {
|
|
tenantIDs, err := tenant.TenantIDsFromOrgID(tenantID)
|
|
if err != nil {
|
|
// this is unlikely to happen since we do tenantID validation when creating the queue.
|
|
level.Error(util_log.Logger).Log("msg", "failed to shuffle consumers because of errors in tenantID extraction", "tenant", tenantID, "error", err)
|
|
continue
|
|
}
|
|
|
|
consumersToSelect := validation.SmallestPositiveNonZeroIntPerTenant(
|
|
tenantIDs,
|
|
func(tenantID string) int {
|
|
return q.limits.MaxConsumers(tenantID, len(q.sortedConsumers))
|
|
},
|
|
)
|
|
uq.consumers = shuffleConsumersForTenants(uq.seed, consumersToSelect, q.sortedConsumers, scratchpad)
|
|
}
|
|
}
|
|
}
|
|
|
|
// shuffleConsumersForTenants returns nil if consumersToSelect is 0 or there are not enough consumers to select from.
|
|
// In that case *all* consumers should be used.
|
|
// Scratchpad is used for shuffling, to avoid new allocations. If nil, new slice is allocated.
|
|
func shuffleConsumersForTenants(userSeed int64, consumersToSelect int, allSortedConsumers []string, scratchpad []string) map[string]struct{} {
|
|
if consumersToSelect == 0 || len(allSortedConsumers) <= consumersToSelect {
|
|
return nil
|
|
}
|
|
|
|
result := make(map[string]struct{}, consumersToSelect)
|
|
rnd := rand.New(rand.NewSource(userSeed))
|
|
|
|
scratchpad = scratchpad[:0]
|
|
scratchpad = append(scratchpad, allSortedConsumers...)
|
|
|
|
last := len(scratchpad) - 1
|
|
for i := 0; i < consumersToSelect; i++ {
|
|
r := rnd.Intn(last + 1)
|
|
result[scratchpad[r]] = struct{}{}
|
|
// move selected item to the end, it won't be selected anymore.
|
|
scratchpad[r], scratchpad[last] = scratchpad[last], scratchpad[r]
|
|
last--
|
|
}
|
|
|
|
return result
|
|
}
|
|
|