Like Prometheus, but for logs.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
loki/pkg/queue/tenant_queues.go

354 lines
9.9 KiB

// SPDX-License-Identifier: AGPL-3.0-only
// Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/scheduler/queue/user_queues.go
// Provenance-includes-license: Apache-2.0
// Provenance-includes-copyright: The Cortex Authors.
package queue
import (
"fmt"
"math/rand"
"sort"
"time"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/tenant"
"github.com/grafana/loki/v3/pkg/util"
util_log "github.com/grafana/loki/v3/pkg/util/log"
"github.com/grafana/loki/v3/pkg/util/validation"
)
type intPointerMap map[string]*int
func (tqs intPointerMap) Inc(key string) int {
ptr, ok := tqs[key]
if !ok {
size := 1
tqs[key] = &size
return size
}
(*ptr)++
return *ptr
}
func (tqs intPointerMap) Dec(key string) int {
ptr, ok := tqs[key]
if !ok {
return 0
}
(*ptr)--
if *ptr == 0 {
delete(tqs, key)
}
return *ptr
}
// consumer holds information about a consumer registered in the queue.
type consumer struct {
// Number of active connections.
connections int
// True if the consumer notified it's gracefully shutting down.
shuttingDown bool
// When the last connection has been unregistered.
disconnectedAt time.Time
}
// This struct holds tenant queues for pending requests. It also keeps track of connected consumers,
// and mapping between tenants and consumers.
type tenantQueues struct {
mapping *Mapping[*tenantQueue]
maxUserQueueSize int
perUserQueueLen intPointerMap
// How long to wait before removing a consumer which has got disconnected
// but hasn't notified about a graceful shutdown.
forgetDelay time.Duration
// Tracks consumers registered to the queue.
consumers map[string]*consumer
// sortedConsumer list of consumer IDs, used when creating per-user shard.
sortedConsumers []string
limits Limits
}
type Queue interface {
Chan() RequestChannel
Dequeue() Request
Name() string
Len() int
}
type Mapable interface {
*tenantQueue | *TreeQueue
// https://github.com/golang/go/issues/48522#issuecomment-924348755
Pos() QueueIndex
SetPos(index QueueIndex)
}
type tenantQueue struct {
*TreeQueue
// If not nil, only these consumers can handle user requests. If nil, all consumers can.
// We set this to nil if number of available consumers <= MaxConsumers.
consumers map[string]struct{}
// Seed for shuffle sharding of consumers. This seed is based on userID only and is therefore consistent
// between different frontends.
seed int64
}
func newTenantQueues(maxUserQueueSize int, forgetDelay time.Duration, limits Limits) *tenantQueues {
mm := &Mapping[*tenantQueue]{}
mm.Init(64)
return &tenantQueues{
mapping: mm,
maxUserQueueSize: maxUserQueueSize,
perUserQueueLen: make(intPointerMap),
forgetDelay: forgetDelay,
consumers: map[string]*consumer{},
sortedConsumers: nil,
limits: limits,
}
}
func (q *tenantQueues) hasNoTenantQueues() bool {
return q.mapping.Len() == 0
}
func (q *tenantQueues) deleteQueue(tenant string) {
q.mapping.Remove(tenant)
}
// Returns existing or new queue for a tenant.
func (q *tenantQueues) getOrAddQueue(tenantID string, path []string) (Queue, error) {
// Empty tenant is not allowed, as that would break our tenants list ("" is used for free spot).
if tenantID == "" {
return nil, fmt.Errorf("empty tenant is not allowed")
}
// extract tenantIDs to compute limits for multi-tenant queries
tenantIDs, err := tenant.TenantIDsFromOrgID(tenantID)
if err != nil {
return nil, fmt.Errorf("extract tenant ids: %w", err)
}
uq := q.mapping.GetByKey(tenantID)
if uq == nil {
uq = &tenantQueue{
seed: util.ShuffleShardSeed(tenantID, ""),
}
uq.TreeQueue = newTreeQueue(q.maxUserQueueSize, tenantID)
q.mapping.Put(tenantID, uq)
}
consumersToSelect := validation.SmallestPositiveNonZeroIntPerTenant(
tenantIDs,
func(tenantID string) int {
return q.limits.MaxConsumers(tenantID, len(q.sortedConsumers))
},
)
if len(uq.consumers) != consumersToSelect {
uq.consumers = shuffleConsumersForTenants(uq.seed, consumersToSelect, q.sortedConsumers, nil)
}
if len(path) == 0 {
return uq, nil
}
return uq.add(path), nil
}
// Finds next queue for the consumer. To support fair scheduling between users, client is expected
// to pass last user index returned by this function as argument. Is there was no previous
// last user index, use -1.
func (q *tenantQueues) getNextQueueForConsumer(lastUserIndex QueueIndex, consumerID string) (Queue, string, QueueIndex) {
uid := lastUserIndex
// at the RequestQueue level we don't have local queues, so start index is -1
if uid == StartIndexWithLocalQueue {
uid = StartIndex
}
// Ensure the consumer is not shutting down. If the consumer is shutting down, we shouldn't forward
// any more queries to it.
if info := q.consumers[consumerID]; info == nil || info.shuttingDown {
return nil, "", uid
}
maxIters := len(q.mapping.keys) + 1
for iters := 0; iters < maxIters; iters++ {
tq, err := q.mapping.GetNext(uid)
if err == ErrOutOfBounds {
uid = StartIndex
continue
}
if tq == nil {
break
}
uid = tq.pos
if tq.consumers != nil {
if _, ok := tq.consumers[consumerID]; !ok {
// This consumer is not handling the user.
continue
}
}
return tq, tq.name, uid
}
return nil, "", uid
}
func (q *tenantQueues) addConsumerToConnection(consumerID string) {
info := q.consumers[consumerID]
if info != nil {
info.connections++
// Reset in case the consumer re-connected while it was in the forget waiting period.
info.shuttingDown = false
info.disconnectedAt = time.Time{}
return
}
// First connection from this consumer.
q.consumers[consumerID] = &consumer{connections: 1}
q.sortedConsumers = append(q.sortedConsumers, consumerID)
sort.Strings(q.sortedConsumers)
q.recomputeUserConsumers()
}
func (q *tenantQueues) removeConsumerConnection(consumerID string, now time.Time) {
info := q.consumers[consumerID]
if info == nil || info.connections <= 0 {
panic("unexpected number of connections for consumer")
}
// Decrease the number of active connections.
info.connections--
if info.connections > 0 {
return
}
// There no more active connections. If the forget delay is configured then
// we can remove it only if consumer has announced a graceful shutdown.
if info.shuttingDown || q.forgetDelay == 0 {
q.removeConsumer(consumerID)
return
}
// No graceful shutdown has been notified yet, so we should track the current time
// so that we'll remove the consumer as soon as we receive the graceful shutdown
// notification (if any) or once the threshold expires.
info.disconnectedAt = now
}
func (q *tenantQueues) removeConsumer(consumerID string) {
delete(q.consumers, consumerID)
ix := sort.SearchStrings(q.sortedConsumers, consumerID)
if ix >= len(q.sortedConsumers) || q.sortedConsumers[ix] != consumerID {
panic("incorrect state of sorted consumers")
}
q.sortedConsumers = append(q.sortedConsumers[:ix], q.sortedConsumers[ix+1:]...)
q.recomputeUserConsumers()
}
// notifyQuerierShutdown records that a consumer has sent notification about a graceful shutdown.
func (q *tenantQueues) notifyQuerierShutdown(consumerID string) {
info := q.consumers[consumerID]
if info == nil {
// The consumer may have already been removed, so we just ignore it.
return
}
// If there are no more connections, we should remove the consumer.
if info.connections == 0 {
q.removeConsumer(consumerID)
return
}
// Otherwise we should annotate we received a graceful shutdown notification
// and the consumer will be removed once all connections are unregistered.
info.shuttingDown = true
}
// forgetDisconnectedConsumers removes all disconnected consumer that have gone since at least
// the forget delay. Returns the number of forgotten consumers.
func (q *tenantQueues) forgetDisconnectedConsumers(now time.Time) int {
// Nothing to do if the forget delay is disabled.
if q.forgetDelay == 0 {
return 0
}
// Remove all consumers with no connections that have gone since at least the forget delay.
threshold := now.Add(-q.forgetDelay)
forgotten := 0
for id := range q.consumers {
if info := q.consumers[id]; info.connections == 0 && info.disconnectedAt.Before(threshold) {
q.removeConsumer(id)
forgotten++
}
}
return forgotten
}
func (q *tenantQueues) recomputeUserConsumers() {
scratchpad := make([]string, 0, len(q.sortedConsumers))
for _, tenantID := range q.mapping.Keys() {
if uq := q.mapping.GetByKey(tenantID); uq != nil {
tenantIDs, err := tenant.TenantIDsFromOrgID(tenantID)
if err != nil {
// this is unlikely to happen since we do tenantID validation when creating the queue.
level.Error(util_log.Logger).Log("msg", "failed to shuffle consumers because of errors in tenantID extraction", "tenant", tenantID, "error", err)
continue
}
consumersToSelect := validation.SmallestPositiveNonZeroIntPerTenant(
tenantIDs,
func(tenantID string) int {
return q.limits.MaxConsumers(tenantID, len(q.sortedConsumers))
},
)
uq.consumers = shuffleConsumersForTenants(uq.seed, consumersToSelect, q.sortedConsumers, scratchpad)
}
}
}
// shuffleConsumersForTenants returns nil if consumersToSelect is 0 or there are not enough consumers to select from.
// In that case *all* consumers should be used.
// Scratchpad is used for shuffling, to avoid new allocations. If nil, new slice is allocated.
func shuffleConsumersForTenants(userSeed int64, consumersToSelect int, allSortedConsumers []string, scratchpad []string) map[string]struct{} {
if consumersToSelect == 0 || len(allSortedConsumers) <= consumersToSelect {
return nil
}
result := make(map[string]struct{}, consumersToSelect)
rnd := rand.New(rand.NewSource(userSeed))
scratchpad = scratchpad[:0]
scratchpad = append(scratchpad, allSortedConsumers...)
last := len(scratchpad) - 1
for i := 0; i < consumersToSelect; i++ {
r := rnd.Intn(last + 1)
result[scratchpad[r]] = struct{}{}
// move selected item to the end, it won't be selected anymore.
scratchpad[r], scratchpad[last] = scratchpad[last], scratchpad[r]
last--
}
return result
}