package queue

import (
	"context"
	"errors"
	"fmt"
	"strconv"
	"sync"
	"testing"
	"time"

	"github.com/grafana/dskit/services"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/atomic"
	"golang.org/x/sync/errgroup"

	"github.com/grafana/loki/v3/pkg/util/constants"
)

func BenchmarkGetNextRequest(b *testing.B) {
	const maxOutstandingPerTenant = 2
	const numTenants = 50
	const queriers = 5

	type generateActor func(i int) []string

	benchCases := []struct {
		name string
		fn   generateActor
	}{
		{
			"without sub-queues",
			func(_ int) []string { return nil },
		},
		{
			"with 1 level of sub-queues",
			func(i int) []string { return []string{fmt.Sprintf("user-%d", i%11)} },
		},
		{
			"with 2 levels of sub-queues",
			func(i int) []string { return []string{fmt.Sprintf("user-%d", i%11), fmt.Sprintf("tool-%d", i%9)} },
		},
	}

	for _, benchCase := range benchCases {
		b.Run(benchCase.name, func(b *testing.B) {
			queues := make([]*RequestQueue, 0, b.N)

			for n := 0; n < b.N; n++ {
				queue := NewRequestQueue(maxOutstandingPerTenant, 0, noQueueLimits, NewMetrics(nil, constants.Loki, "query_scheduler"))
				queues = append(queues, queue)

				for ix := 0; ix < queriers; ix++ {
					queue.RegisterConsumerConnection(fmt.Sprintf("querier-%d", ix))
				}

				for i := 0; i < maxOutstandingPerTenant; i++ {
					for j := 0; j < numTenants; j++ {
						userID := strconv.Itoa(j)
						err := queue.Enqueue(userID, benchCase.fn(j), "request", nil)
						if err != nil {
							b.Fatal(err)
						}
					}
				}
			}

			querierNames := make([]string, queriers)
			for x := 0; x < queriers; x++ {
				querierNames[x] = fmt.Sprintf("querier-%d", x)
			}

			ctx := context.Background()
			b.ResetTimer()

			for i := 0; i < b.N; i++ {
				for j := 0; j < queriers; j++ {
					idx := StartIndexWithLocalQueue
					for x := 0; x < maxOutstandingPerTenant*numTenants/queriers; x++ {
						r, nidx, err := queues[i].Dequeue(ctx, idx, querierNames[j])
						if r == nil {
							break
						}
						if err != nil {
							b.Fatal(err)
						}
						idx = nidx
					}
				}
			}
		})
	}
}

func BenchmarkQueueRequest(b *testing.B) {
	const maxOutstandingPerTenant = 2
	const numTenants = 50
	const queriers = 5

	queues := make([]*RequestQueue, 0, b.N)
	users := make([]string, 0, numTenants)
	requests := make([]string, 0, numTenants)

	for n := 0; n < b.N; n++ {
		q := NewRequestQueue(maxOutstandingPerTenant, 0, noQueueLimits, NewMetrics(nil, constants.Loki, "query_scheduler"))

		for ix := 0; ix < queriers; ix++ {
			q.RegisterConsumerConnection(fmt.Sprintf("querier-%d", ix))
		}

		queues = append(queues, q)

		for j := 0; j < numTenants; j++ {
			requests = append(requests, fmt.Sprintf("%d-%d", n, j))
			users = append(users, strconv.Itoa(j))
		}
	}

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		for i := 0; i < maxOutstandingPerTenant; i++ {
			for j := 0; j < numTenants; j++ {
				err := queues[n].Enqueue(users[j], nil, requests[j], nil)
				if err != nil {
					b.Fatal(err)
				}
			}
		}
	}
}

func TestRequestQueue_GetNextRequestForQuerier_ShouldGetRequestAfterReshardingBecauseQuerierHasBeenForgotten(t *testing.T) {
	const forgetDelay = 3 * time.Second

	queue := NewRequestQueue(1, forgetDelay, &mockQueueLimits{maxConsumers: 1}, NewMetrics(nil, constants.Loki, "query_scheduler"))

	// Start the queue service.
	ctx := context.Background()
	require.NoError(t, services.StartAndAwaitRunning(ctx, queue))
	t.Cleanup(func() {
		require.NoError(t, services.StopAndAwaitTerminated(ctx, queue))
	})

	// Two queriers connect.
	queue.RegisterConsumerConnection("querier-1")
	queue.RegisterConsumerConnection("querier-2")

	// Querier-2 waits for a new request.
	querier2wg := sync.WaitGroup{}
	querier2wg.Add(1)

	go func() {
		defer querier2wg.Done()
		_, _, err := queue.Dequeue(ctx, StartIndex, "querier-2")
		require.NoError(t, err)
	}()

	// Querier-1 crashes (no graceful shutdown notification).
	queue.UnregisterConsumerConnection("querier-1")

	// Enqueue a request from a user which would be assigned to querier-1.
	// NOTE: the "user-1" hash falls in the querier-1 shard.
	require.NoError(t, queue.Enqueue("user-1", nil, "request", nil))

	startTime := time.Now()
	querier2wg.Wait()
	waitTime := time.Since(startTime)

	// We expect querier-2 to get the request only after the querier-1 forget delay has passed.
	assert.GreaterOrEqual(t, waitTime.Milliseconds(), forgetDelay.Milliseconds())
}

func TestContextCond(t *testing.T) {
	t.Run("wait until broadcast", func(t *testing.T) {
		t.Parallel()
		mtx := &sync.Mutex{}
		cond := contextCond{Cond: sync.NewCond(mtx)}

		doneWaiting := make(chan struct{})

		mtx.Lock()
		go func() {
			cond.Wait(context.Background())
			mtx.Unlock()
			close(doneWaiting)
		}()

		assertChanNotReceived(t, doneWaiting, 100*time.Millisecond, "cond.Wait returned, but it should not because we did not broadcast yet")

		cond.Broadcast()
		assertChanReceived(t, doneWaiting, 250*time.Millisecond, "cond.Wait did not return after broadcast")
	})

	t.Run("wait until context deadline", func(t *testing.T) {
		t.Parallel()
		mtx := &sync.Mutex{}
		cond := contextCond{Cond: sync.NewCond(mtx)}
		doneWaiting := make(chan struct{})

		ctx, cancel := context.WithCancel(context.Background())
		defer cancel()

		mtx.Lock()
		go func() {
			cond.Wait(ctx)
			mtx.Unlock()
			close(doneWaiting)
		}()

		assertChanNotReceived(t, doneWaiting, 100*time.Millisecond, "cond.Wait returned, but it should not because we did not broadcast yet and didn't cancel the context")

		cancel()
		assertChanReceived(t, doneWaiting, 250*time.Millisecond, "cond.Wait did not return after cancelling the context")
	})

	t.Run("wait on already canceled context", func(t *testing.T) {
		// This test represents the racy real-world scenario:
		// we don't know whether it's going to wait before the broadcast triggered by the context cancellation.
		t.Parallel()
		mtx := &sync.Mutex{}
		cond := contextCond{Cond: sync.NewCond(mtx)}
		doneWaiting := make(chan struct{})

		alreadyCanceledContext, cancel := context.WithCancel(context.Background())
		cancel()

		mtx.Lock()
		go func() {
			cond.Wait(alreadyCanceledContext)
			mtx.Unlock()
			close(doneWaiting)
		}()

		assertChanReceived(t, doneWaiting, 250*time.Millisecond, "cond.Wait did not return after cancelling the context")
	})

	t.Run("wait on already canceled context, but it takes a while to wait", func(t *testing.T) {
		t.Parallel()
		mtx := &sync.Mutex{}
		cond := contextCond{
			Cond: sync.NewCond(mtx),
			testHookBeforeWaiting: func() {
				// This makes the waiting goroutine so slow that our Wait(ctx) will need to broadcast once it sees it waiting.
				time.Sleep(250 * time.Millisecond)
			},
		}
		doneWaiting := make(chan struct{})

		alreadyCanceledContext, cancel := context.WithCancel(context.Background())
		cancel()

		mtx.Lock()
		go func() {
			cond.Wait(alreadyCanceledContext)
			mtx.Unlock()
			close(doneWaiting)
		}()

		assertChanReceived(t, doneWaiting, time.Second, "cond.Wait did not return even though the context was already canceled")
	})

	t.Run("lots of goroutines waiting at the same time, none of them misses its broadcast from cancel", func(t *testing.T) {
		t.Parallel()
		mtx := &sync.Mutex{}
		cond := contextCond{
			Cond: sync.NewCond(mtx),
			testHookBeforeWaiting: func() {
				// Wait just a little bit to create every goroutine
				time.Sleep(time.Millisecond)
			},
		}
		const goroutines = 100

		doneWaiting := make(chan struct{}, goroutines)
		release := make(chan struct{})

		ctx, cancel := context.WithCancel(context.Background())
		cancel()

		for i := 0; i < goroutines; i++ {
			go func() {
				<-release

				mtx.Lock()
				cond.Wait(ctx)
				mtx.Unlock()

				doneWaiting <- struct{}{}
			}()
		}
		go func() {
			<-release
			cancel()
		}()

		close(release)

		assert.Eventually(t, func() bool {
			return len(doneWaiting) == goroutines
		}, time.Second, 10*time.Millisecond)
	})
}

func TestMaxQueueSize(t *testing.T) {
	t.Run("queue size is tracked per tenant", func(t *testing.T) {
		maxSize := 3
		queue := NewRequestQueue(maxSize, 0, noQueueLimits, NewMetrics(nil, constants.Loki, "query_scheduler"))
		queue.RegisterConsumerConnection("querier")

		// enqueue maxSize items with different actors
		// different actors have individual channels with maxSize length
		assert.NoError(t, queue.Enqueue("tenant", []string{"user-a"}, 1, nil))
		assert.NoError(t, queue.Enqueue("tenant", []string{"user-b"}, 2, nil))
		assert.NoError(t, queue.Enqueue("tenant", []string{"user-c"}, 3, nil))

		// max queue length per tenant is tracked globally for all actors within a tenant
		err := queue.Enqueue("tenant", []string{"user-a"}, 4, nil)
		assert.Equal(t, err, ErrTooManyRequests)

		// dequeue and enqueue some items
		_, _, err = queue.Dequeue(context.Background(), StartIndexWithLocalQueue, "querier")
		assert.NoError(t, err)
		_, _, err = queue.Dequeue(context.Background(), StartIndexWithLocalQueue, "querier")
		assert.NoError(t, err)

		assert.NoError(t, queue.Enqueue("tenant", []string{"user-a"}, 4, nil))
		assert.NoError(t, queue.Enqueue("tenant", []string{"user-b"}, 5, nil))

		err = queue.Enqueue("tenant", []string{"user-c"}, 6, nil)
		assert.Equal(t, err, ErrTooManyRequests)
	})
}

type mockLimits struct {
	maxConsumer int
}

func (l *mockLimits) MaxConsumers(_ string, _ int) int {
	return l.maxConsumer
}

func Test_Queue_DequeueMany(t *testing.T) {
	tenantsQueueMaxSize := 100
	tests := map[string]struct {
		tenantsCount          int
		tasksPerTenant        int
		consumersCount        int
		dequeueBatchSize      int
		maxConsumersPerTenant int
		consumerDelay         time.Duration
		senderDelay           time.Duration
	}{
		"tenants-1_tasks-10_consumers-3_batch-2": {
			tenantsCount:     1,
			tasksPerTenant:   10,
			consumersCount:   3,
			dequeueBatchSize: 2,
		},
		"tenants-10_tasks-10_consumers-2_batch-10": {
			tenantsCount:     10,
			tasksPerTenant:   10,
			consumersCount:   2,
			dequeueBatchSize: 10,
		},
		"tenants-100_tasks-100_consumers-10_batch-10": {
			tenantsCount:     100,
			tasksPerTenant:   100,
			consumersCount:   10,
			dequeueBatchSize: 10,
		},
		"tenants-100_tasks-100_consumers-10_batch-10_consumerDelay-10": {
			tenantsCount:     100,
			tasksPerTenant:   100,
			consumersCount:   10,
			dequeueBatchSize: 10,
			consumerDelay:    10 * time.Millisecond,
		},
		"tenants-100_tasks-100_consumers-10_batch-10_consumerDelay-10_senderDelay-10": {
			tenantsCount:     100,
			tasksPerTenant:   100,
			consumersCount:   10,
			dequeueBatchSize: 10,
			consumerDelay:    10 * time.Millisecond,
			senderDelay:      10 * time.Millisecond,
		},
		"tenants-10_tasks-50_consumers-10_batch-3": {
			tenantsCount:     10,
			tasksPerTenant:   50,
			consumersCount:   10,
			dequeueBatchSize: 3,
		},
		"tenants-10_tasks-50_consumers-1_batch-5": {
			tenantsCount:     10,
			tasksPerTenant:   50,
			consumersCount:   1,
			dequeueBatchSize: 5,
		},
		"tenants-5_tasks-10_consumers-1_batch-2": {
			tenantsCount:     5,
			tasksPerTenant:   10,
			consumersCount:   1,
			dequeueBatchSize: 2,
		},
		"tenants-1_tasks-10_consumers-1_batch-2": {
			tenantsCount:     1,
			tasksPerTenant:   10,
			consumersCount:   1,
			dequeueBatchSize: 2,
		},
		"tenants-1_tasks-10_consumers-10_batch-2": {
			tenantsCount:     1,
			tasksPerTenant:   10,
			consumersCount:   10,
			dequeueBatchSize: 2,
		},
		"tenants-100_tasks-10_consumers-10_batch-6_maxConsumersPerTenant-4": {
			tenantsCount:          100,
			tasksPerTenant:        10,
			consumersCount:        10,
			dequeueBatchSize:      6,
			maxConsumersPerTenant: 4,
		},
		"tenants-200_tasks-10_consumers-100_batch-999999_maxConsumersPerTenant-4": {
			tenantsCount:          200,
			tasksPerTenant:        10,
			consumersCount:        100,
			dequeueBatchSize:      999_999,
			maxConsumersPerTenant: 4,
		},
		"tenants-10_tasks-100_consumers-20_batch-5_maxConsumersPerTenant-16": {
			tenantsCount:          10,
			tasksPerTenant:        100,
			consumersCount:        20,
			dequeueBatchSize:      5,
			maxConsumersPerTenant: 16,
		},
		"tenants-1_tasks-100_consumers-16_batch-5_maxConsumersPerTenant-2": {
			tenantsCount:          1,
			tasksPerTenant:        100,
			consumersCount:        16,
			dequeueBatchSize:      5,
			maxConsumersPerTenant: 2,
		},
	}
	for name, tt := range tests {
		t.Run(name, func(t *testing.T) {
			totalTasksCount := tt.tenantsCount * tt.tasksPerTenant
			limits := &mockLimits{maxConsumer: tt.maxConsumersPerTenant}
			require.LessOrEqual(t, tt.tasksPerTenant, tenantsQueueMaxSize, "test must be able to enqueue all the tasks without error")
			queue := NewRequestQueue(tenantsQueueMaxSize, 0, limits, NewMetrics(nil, constants.Loki, "query_scheduler"))
			for i := 0; i < tt.consumersCount; i++ {
				queue.RegisterConsumerConnection(createConsumerName(i))
			}

			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()
			wg, _ := errgroup.WithContext(ctx)

			mtx := &sync.Mutex{}
			receivedTasksPerTenant := make(map[string]int, tt.tenantsCount)
			receivedTasksCount := atomic.NewInt32(0)
			for i := 0; i < tt.consumersCount; i++ {
				consumer := createConsumerName(i)
				wg.Go(func() error {
					idx := StartIndexWithLocalQueue
					for {
						tasks, newIdx, err := queue.DequeueMany(ctx, idx, consumer, tt.dequeueBatchSize)

						if err != nil && !errors.Is(err, context.Canceled) {
							return fmt.Errorf("error while dequeueing tasks by %s: %w", consumer, err)
						}
						if err == nil && len(tasks) == 0 {
							return fmt.Errorf("%s must receive at least one item if there is no error, but got 0", consumer)
						}
						if len(tasks) > 1 && !isAllItemsSame(tasks) {
							return fmt.Errorf("expected all items to be from the same tenant, but they were not: %v", tasks)
						}

						time.Sleep(tt.consumerDelay)
						collectTasks(mtx, err, tasks, receivedTasksPerTenant)
						receivedTasksTotal := receivedTasksCount.Add(int32(len(tasks)))
						// put the slice back to the pool
						queue.ReleaseRequests(tasks)
						if receivedTasksTotal == int32(totalTasksCount) {
							// cancel the context to release the rest of the consumers once we have received all the tasks from all the queues
							cancel()
							return nil
						}
						idx = newIdx
					}
				})
			}

			enqueueTasksAsync(tt.tenantsCount, tt.tasksPerTenant, tt.senderDelay, wg, queue)

			require.NoError(t, wg.Wait())
			require.Len(t, receivedTasksPerTenant, tt.tenantsCount)
			for tenant, actualTasksCount := range receivedTasksPerTenant {
				require.Equalf(t, tt.tasksPerTenant, actualTasksCount, "%s received fewer tasks than expected", tenant)
			}
		})
	}
}
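
// enqueueTasksAsync starts one errgroup goroutine per tenant; each goroutine enqueues
// tasksPerTenant tasks (using the tenant name as the task payload) and sleeps senderDelay
// between enqueues.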
func enqueueTasksAsync(tenantsCount int, tasksPerTenant int, senderDelay time.Duration, wg *errgroup.Group, queue *RequestQueue) {
	for i := 0; i < tenantsCount; i++ {
		tenant := fmt.Sprintf("tenant-%d", i)
		wg.Go(func() error {
			for j := 0; j < tasksPerTenant; j++ {
				err := queue.Enqueue(tenant, []string{}, tenant, nil)
				if err != nil {
					return fmt.Errorf("error while enqueueing task for %s: %w", tenant, err)
				}
				time.Sleep(senderDelay)
			}
			return nil
		})
	}
}

// collectTasks records how many tasks were received for each tenant.
func collectTasks(mtx *sync.Mutex, err error, tasks []Request, receivedTasksPerTenant map[string]int) {
	if err != nil {
		return
	}
	mtx.Lock()
	defer mtx.Unlock()
	// the tenant name is sent as the task payload
	tenant := tasks[0]
	receivedTasksPerTenant[fmt.Sprintf("%v", tenant)] += len(tasks)
}

func createConsumerName(i int) string {
	return fmt.Sprintf("consumer-%d", i)
}

// isAllItemsSame reports whether every item in the slice equals the first one.
func isAllItemsSame[T comparable](items []T) bool {
	firstItem := items[0]
	for i := 1; i < len(items); i++ {
		if items[i] != firstItem {
			return false
		}
	}
	return true
}

// assertChanReceived fails the test if nothing is received on c within timeout.
func assertChanReceived(t *testing.T, c chan struct{}, timeout time.Duration, msg string) {
	t.Helper()

	select {
	case <-c:
	case <-time.After(timeout):
		t.Fatal(msg)
	}
}

// assertChanNotReceived fails the test if anything is received on c within wait.
func assertChanNotReceived(t *testing.T, c chan struct{}, wait time.Duration, msg string, args ...interface{}) {
	t.Helper()

	select {
	case <-c:
		t.Fatalf(msg, args...)
	case <-time.After(wait):
		// OK!
	}
}