Like Prometheus, but for logs.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
loki/pkg/queue/queue_test.go

571 lines
16 KiB

package queue
import (
"context"
DequeueMany fix (#11797) **What this PR does / why we need it**: Previously we relied only on the index of the tenant's queue to read the requests from the same tenant's queue. However, as long as the queue is aggressively used by the consumers in parallel, there are a few edge cases when knowing the index of last used tenant's queue is not enough to guarantee that we dequeue items from exactly the same tenant's queue. **Special notes for your reviewer**: **Checklist** - [ ] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ ] Documentation added - [x] Tests updated - [ ] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Signed-off-by: Vladyslav Diachenko <vlad.diachenko@grafana.com>
2 years ago
"errors"
"fmt"
"strconv"
"sync"
"testing"
"time"
"github.com/grafana/dskit/services"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
DequeueMany fix (#11797) **What this PR does / why we need it**: Previously we relied only on the index of the tenant's queue to read the requests from the same tenant's queue. However, as long as the queue is aggressively used by the consumers in parallel, there are a few edge cases when knowing the index of last used tenant's queue is not enough to guarantee that we dequeue items from exactly the same tenant's queue. **Special notes for your reviewer**: **Checklist** - [ ] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ ] Documentation added - [x] Tests updated - [ ] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Signed-off-by: Vladyslav Diachenko <vlad.diachenko@grafana.com>
2 years ago
"go.uber.org/atomic"
"golang.org/x/sync/errgroup"
"github.com/grafana/loki/pkg/util/constants"
)
func BenchmarkGetNextRequest(b *testing.B) {
const maxOutstandingPerTenant = 2
const numTenants = 50
const queriers = 5
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
type generateActor func(i int) []string
benchCases := []struct {
name string
fn generateActor
}{
{
"without sub-queues",
func(i int) []string { return nil },
},
{
"with 1 level of sub-queues",
func(i int) []string { return []string{fmt.Sprintf("user-%d", i%11)} },
},
{
"with 2 levels of sub-queues",
func(i int) []string { return []string{fmt.Sprintf("user-%d", i%11), fmt.Sprintf("tool-%d", i%9)} },
},
}
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
for _, benchCase := range benchCases {
benchCase := benchCase
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
b.Run(benchCase.name, func(b *testing.B) {
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
queues := make([]*RequestQueue, 0, b.N)
for n := 0; n < b.N; n++ {
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
queue := NewRequestQueue(maxOutstandingPerTenant, 0, noQueueLimits, NewMetrics(nil, constants.Loki, "query_scheduler"))
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
queues = append(queues, queue)
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
for ix := 0; ix < queriers; ix++ {
queue.RegisterConsumerConnection(fmt.Sprintf("querier-%d", ix))
}
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
for i := 0; i < maxOutstandingPerTenant; i++ {
for j := 0; j < numTenants; j++ {
userID := strconv.Itoa(j)
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
err := queue.Enqueue(userID, benchCase.fn(j), "request", nil)
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
if err != nil {
b.Fatal(err)
}
}
}
}
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
querierNames := make([]string, queriers)
for x := 0; x < queriers; x++ {
querierNames[x] = fmt.Sprintf("querier-%d", x)
}
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
ctx := context.Background()
b.ResetTimer()
for i := 0; i < b.N; i++ {
for j := 0; j < queriers; j++ {
idx := StartIndexWithLocalQueue
for x := 0; x < maxOutstandingPerTenant*numTenants/queriers; x++ {
r, nidx, err := queues[i].Dequeue(ctx, idx, querierNames[j])
if r == nil {
break
}
if err != nil {
b.Fatal(err)
}
idx = nidx
}
}
}
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
})
}
Scheduler: Add query fairness control across multiple actors within a tenant (#8752) **What this PR does / why we need it**: This PR wires up the scheduler with the hierarchical queues. It is the last PR to implement https://github.com/grafana/loki/pull/8585. When these changes are in place, the client performing query requests can control their QoS (query fairness) using the `X-Actor-Path` HTTP header. This header controls in which sub-queue of the tenant's scheduler queue the query request is enqueued. The place within the hierarchy where it is enqueued defines the probability with which the request gets dequeued. A common use-case for this QoS control is giving each Grafana user within a tenant their fair share of query execution time. Any documentation is still missing and will be provided by follow-up PRs. **Special notes for your reviewer**: ```console $ gotest -count=1 -v ./pkg/scheduler/queue/... -test.run=TestQueryFairness === RUN TestQueryFairness === RUN TestQueryFairness/use_hierarchical_queues_=_false dequeue_qos_test.go:109: duration actor a 2.007765568s dequeue_qos_test.go:109: duration actor b 2.209088331s dequeue_qos_test.go:112: total duration 2.209280772s === RUN TestQueryFairness/use_hierarchical_queues_=_true dequeue_qos_test.go:109: duration actor b 605.283144ms dequeue_qos_test.go:109: duration actor a 2.270931324s dequeue_qos_test.go:112: total duration 2.271108551s --- PASS: TestQueryFairness (4.48s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_false (2.21s) --- PASS: TestQueryFairness/use_hierarchical_queues_=_true (2.27s) PASS ok github.com/grafana/loki/pkg/scheduler/queue 4.491s ``` ```console $ gotest -count=5 -v ./pkg/scheduler/queue/... -bench=Benchmark -test.run=^$ -benchtime=10000x -benchmem goos: linux goarch: amd64 pkg: github.com/grafana/loki/pkg/scheduler/queue cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz BenchmarkGetNextRequest BenchmarkGetNextRequest/without_sub-queues BenchmarkGetNextRequest/without_sub-queues-8 10000 29337 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21348 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21595 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21189 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/without_sub-queues-8 10000 21602 ns/op 1600 B/op 100 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33770 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33596 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 34432 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33760 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_1_level_of_sub-queues-8 10000 33664 ns/op 2400 B/op 200 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 71405 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 59472 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 117163 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 106505 ns/op 3200 B/op 300 allocs/op BenchmarkGetNextRequest/with_2_levels_of_sub-queues-8 10000 64374 ns/op 3200 B/op 300 allocs/op BenchmarkQueueRequest BenchmarkQueueRequest-8 10000 168391 ns/op 320588 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 166203 ns/op 320587 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 149518 ns/op 320584 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 219776 ns/op 320583 B/op 1156 allocs/op BenchmarkQueueRequest-8 10000 185198 ns/op 320597 B/op 1156 allocs/op PASS ok github.com/grafana/loki/pkg/scheduler/queue 64.648s ``` Signed-off-by: Christian Haudum <christian.haudum@gmail.com>
3 years ago
}
func BenchmarkQueueRequest(b *testing.B) {
const maxOutstandingPerTenant = 2
const numTenants = 50
const queriers = 5
queues := make([]*RequestQueue, 0, b.N)
users := make([]string, 0, numTenants)
requests := make([]string, 0, numTenants)
for n := 0; n < b.N; n++ {
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
q := NewRequestQueue(maxOutstandingPerTenant, 0, noQueueLimits, NewMetrics(nil, constants.Loki, "query_scheduler"))
for ix := 0; ix < queriers; ix++ {
q.RegisterConsumerConnection(fmt.Sprintf("querier-%d", ix))
}
queues = append(queues, q)
for j := 0; j < numTenants; j++ {
requests = append(requests, fmt.Sprintf("%d-%d", n, j))
users = append(users, strconv.Itoa(j))
}
}
b.ResetTimer()
for n := 0; n < b.N; n++ {
for i := 0; i < maxOutstandingPerTenant; i++ {
for j := 0; j < numTenants; j++ {
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
err := queues[n].Enqueue(users[j], nil, requests[j], nil)
if err != nil {
b.Fatal(err)
}
}
}
}
}
func TestRequestQueue_GetNextRequestForQuerier_ShouldGetRequestAfterReshardingBecauseQuerierHasBeenForgotten(t *testing.T) {
const forgetDelay = 3 * time.Second
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
queue := NewRequestQueue(1, forgetDelay, &mockQueueLimits{maxConsumers: 1}, NewMetrics(nil, constants.Loki, "query_scheduler"))
// Start the queue service.
ctx := context.Background()
require.NoError(t, services.StartAndAwaitRunning(ctx, queue))
t.Cleanup(func() {
require.NoError(t, services.StopAndAwaitTerminated(ctx, queue))
})
// Two queriers connect.
queue.RegisterConsumerConnection("querier-1")
queue.RegisterConsumerConnection("querier-2")
// Querier-2 waits for a new request.
querier2wg := sync.WaitGroup{}
querier2wg.Add(1)
go func() {
defer querier2wg.Done()
_, _, err := queue.Dequeue(ctx, StartIndex, "querier-2")
require.NoError(t, err)
}()
// Querier-1 crashes (no graceful shutdown notification).
queue.UnregisterConsumerConnection("querier-1")
// Enqueue a request from an user which would be assigned to querier-1.
// NOTE: "user-1" hash falls in the querier-1 shard.
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
require.NoError(t, queue.Enqueue("user-1", nil, "request", nil))
startTime := time.Now()
querier2wg.Wait()
waitTime := time.Since(startTime)
// We expect that querier-2 got the request only after querier-1 forget delay is passed.
assert.GreaterOrEqual(t, waitTime.Milliseconds(), forgetDelay.Milliseconds())
}
func TestContextCond(t *testing.T) {
t.Run("wait until broadcast", func(t *testing.T) {
t.Parallel()
mtx := &sync.Mutex{}
cond := contextCond{Cond: sync.NewCond(mtx)}
doneWaiting := make(chan struct{})
mtx.Lock()
go func() {
cond.Wait(context.Background())
mtx.Unlock()
close(doneWaiting)
}()
assertChanNotReceived(t, doneWaiting, 100*time.Millisecond, "cond.Wait returned, but it should not because we did not broadcast yet")
cond.Broadcast()
assertChanReceived(t, doneWaiting, 250*time.Millisecond, "cond.Wait did not return after broadcast")
})
t.Run("wait until context deadline", func(t *testing.T) {
t.Parallel()
mtx := &sync.Mutex{}
cond := contextCond{Cond: sync.NewCond(mtx)}
doneWaiting := make(chan struct{})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
mtx.Lock()
go func() {
cond.Wait(ctx)
mtx.Unlock()
close(doneWaiting)
}()
assertChanNotReceived(t, doneWaiting, 100*time.Millisecond, "cond.Wait returned, but it should not because we did not broadcast yet and didn't cancel the context")
cancel()
assertChanReceived(t, doneWaiting, 250*time.Millisecond, "cond.Wait did not return after cancelling the context")
})
t.Run("wait on already canceled context", func(t *testing.T) {
// This test represents the racy real world scenario,
// we don't know whether it's going to wait before the broadcast triggered by the context cancellation.
t.Parallel()
mtx := &sync.Mutex{}
cond := contextCond{Cond: sync.NewCond(mtx)}
doneWaiting := make(chan struct{})
alreadyCanceledContext, cancel := context.WithCancel(context.Background())
cancel()
mtx.Lock()
go func() {
cond.Wait(alreadyCanceledContext)
mtx.Unlock()
close(doneWaiting)
}()
assertChanReceived(t, doneWaiting, 250*time.Millisecond, "cond.Wait did not return after cancelling the context")
})
t.Run("wait on already canceled context, but it takes a while to wait", func(t *testing.T) {
t.Parallel()
mtx := &sync.Mutex{}
cond := contextCond{
Cond: sync.NewCond(mtx),
testHookBeforeWaiting: func() {
// This makes the waiting goroutine so slow that out Wait(ctx) will need to broadcast once it sees it waiting.
time.Sleep(250 * time.Millisecond)
},
}
doneWaiting := make(chan struct{})
alreadyCanceledContext, cancel := context.WithCancel(context.Background())
cancel()
mtx.Lock()
go func() {
cond.Wait(alreadyCanceledContext)
mtx.Unlock()
close(doneWaiting)
}()
assertChanReceived(t, doneWaiting, time.Second, "cond.Wait did not return after 500ms")
})
t.Run("lots of goroutines waiting at the same time, none of them misses it's broadcast from cancel", func(t *testing.T) {
t.Parallel()
mtx := &sync.Mutex{}
cond := contextCond{
Cond: sync.NewCond(mtx),
testHookBeforeWaiting: func() {
// Wait just a little bit to create every goroutine
time.Sleep(time.Millisecond)
},
}
const goroutines = 100
doneWaiting := make(chan struct{}, goroutines)
release := make(chan struct{})
ctx, cancel := context.WithCancel(context.Background())
cancel()
for i := 0; i < goroutines; i++ {
go func() {
<-release
mtx.Lock()
cond.Wait(ctx)
mtx.Unlock()
doneWaiting <- struct{}{}
}()
}
go func() {
<-release
cancel()
}()
close(release)
assert.Eventually(t, func() bool {
return len(doneWaiting) == goroutines
}, time.Second, 10*time.Millisecond)
})
}
func TestMaxQueueSize(t *testing.T) {
t.Run("queue size is tracked per tenant", func(t *testing.T) {
maxSize := 3
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
queue := NewRequestQueue(maxSize, 0, noQueueLimits, NewMetrics(nil, constants.Loki, "query_scheduler"))
queue.RegisterConsumerConnection("querier")
// enqueue maxSize items with different actors
// different actors have individual channels with maxSize length
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
assert.NoError(t, queue.Enqueue("tenant", []string{"user-a"}, 1, nil))
assert.NoError(t, queue.Enqueue("tenant", []string{"user-b"}, 2, nil))
assert.NoError(t, queue.Enqueue("tenant", []string{"user-c"}, 3, nil))
// max queue length per tenant is tracked globally for all actors within a tenant
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
err := queue.Enqueue("tenant", []string{"user-a"}, 4, nil)
assert.Equal(t, err, ErrTooManyRequests)
// dequeue and enqueue some items
_, _, err = queue.Dequeue(context.Background(), StartIndexWithLocalQueue, "querier")
assert.NoError(t, err)
_, _, err = queue.Dequeue(context.Background(), StartIndexWithLocalQueue, "querier")
assert.NoError(t, err)
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
assert.NoError(t, queue.Enqueue("tenant", []string{"user-a"}, 4, nil))
assert.NoError(t, queue.Enqueue("tenant", []string{"user-b"}, 5, nil))
config: adds `frontend.max-query-capacity` to tune per-tenant query capacity (#11284) **What this PR does / why we need it**: Adds a new config `frontend.max-query-capacity` that allows users to configure what portion of the the available querier replicas can be used by a tenant. `max_query_capacity` is the corresponding YAML option that can be configured in limits or runtime overrides. For example, setting this to 0.5 would allow a tenant to use half of the available queriers. This complements the existing `frontend.max-queriers-per-tenant`. When both are configured, the smaller value of the resulting querier replica count is considered: ``` min(frontend.max-queriers-per-tenant, ceil(querier_replicas * frontend.max-query-capacity)) ``` *All* queriers will handle requests for a tenant if neither limits are applied. **Which issue(s) this PR fixes**: Fixes #<issue number> **Special notes for your reviewer**: noticed that we don't pass down the shuffle sharding limits for frontend (only using it with schedulers) https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/loki/modules.go#L895 but the [docs](https://github.com/grafana/loki/blob/26f097162a856db48ecbd16bef2f0b750029855b/pkg/validation/limits.go#L276) mention that`frontend.max-queriers-per-tenant` applies to frontend as well. ``` This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL. ``` **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler <julie.stickler@grafana.com> Co-authored-by: Danny Kopping <danny.kopping@grafana.com>
2 years ago
err = queue.Enqueue("tenant", []string{"user-c"}, 6, nil)
assert.Equal(t, err, ErrTooManyRequests)
})
}
DequeueMany fix (#11797) **What this PR does / why we need it**: Previously we relied only on the index of the tenant's queue to read the requests from the same tenant's queue. However, as long as the queue is aggressively used by the consumers in parallel, there are a few edge cases when knowing the index of last used tenant's queue is not enough to guarantee that we dequeue items from exactly the same tenant's queue. **Special notes for your reviewer**: **Checklist** - [ ] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ ] Documentation added - [x] Tests updated - [ ] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Signed-off-by: Vladyslav Diachenko <vlad.diachenko@grafana.com>
2 years ago
type mockLimits struct {
maxConsumer int
}
func (l *mockLimits) MaxConsumers(_ string, _ int) int {
return l.maxConsumer
}
func Test_Queue_DequeueMany(t *testing.T) {
tenantsQueueMaxSize := 100
tests := map[string]struct {
tenantsCount int
tasksPerTenant int
consumersCount int
dequeueBatchSize int
maxConsumersPerTenant int
consumerDelay time.Duration
senderDelay time.Duration
}{
"tenants-1_tasks-10_consumers-3_batch-2": {
tenantsCount: 1,
tasksPerTenant: 10,
consumersCount: 3,
dequeueBatchSize: 2,
},
"tenants-10_tasks-10_consumers-2_batch-10": {
tenantsCount: 10,
tasksPerTenant: 10,
consumersCount: 2,
dequeueBatchSize: 10,
},
"tenants-100_tasks-100_consumers-10_batch-10": {
tenantsCount: 100,
tasksPerTenant: 100,
consumersCount: 10,
dequeueBatchSize: 10,
},
"tenants-100_tasks-100_consumers-10_batch-10_consumerDelay-10": {
tenantsCount: 100,
tasksPerTenant: 100,
consumersCount: 10,
dequeueBatchSize: 10,
consumerDelay: 10 * time.Millisecond,
},
"tenants-100_tasks-100_consumers-10_batch-10_consumerDelay-10_senderDelay-10": {
tenantsCount: 100,
tasksPerTenant: 100,
consumersCount: 10,
dequeueBatchSize: 10,
consumerDelay: 10 * time.Millisecond,
senderDelay: 10 * time.Millisecond,
},
"tenants-10_tasks-50_consumers-10_batch-3": {
tenantsCount: 10,
tasksPerTenant: 50,
consumersCount: 10,
dequeueBatchSize: 3,
},
"tenants-10_tasks-50_consumers-1_batch-5": {
tenantsCount: 10,
tasksPerTenant: 50,
consumersCount: 1,
dequeueBatchSize: 5,
},
"tenants-5_tasks-10_consumers-1_batch-2": {
tenantsCount: 5,
tasksPerTenant: 10,
consumersCount: 1,
dequeueBatchSize: 2,
},
"tenants-1_tasks-10_consumers-1_batch-2": {
tenantsCount: 1,
tasksPerTenant: 10,
consumersCount: 1,
dequeueBatchSize: 2,
},
"tenants-1_tasks-10_consumers-10_batch-2": {
tenantsCount: 1,
tasksPerTenant: 10,
consumersCount: 10,
dequeueBatchSize: 2,
},
"tenants-100_tasks-10_consumers-10_batch-6_maxConsumersPerTenant-6": {
tenantsCount: 100,
tasksPerTenant: 10,
consumersCount: 10,
dequeueBatchSize: 6,
maxConsumersPerTenant: 4,
},
"tenants-200_tasks-10_consumers-100_batch-999999_maxConsumersPerTenant-4": {
tenantsCount: 200,
tasksPerTenant: 10,
consumersCount: 100,
dequeueBatchSize: 999_999,
maxConsumersPerTenant: 4,
},
"tenants-10_tasks-100_consumers-20_batch-5_maxConsumersPerTenant-16": {
tenantsCount: 10,
tasksPerTenant: 100,
consumersCount: 20,
dequeueBatchSize: 5,
maxConsumersPerTenant: 16,
},
"tenants-1_tasks-100_consumers-16_batch-5_maxConsumersPerTenant-2": {
tenantsCount: 1,
tasksPerTenant: 100,
consumersCount: 16,
dequeueBatchSize: 5,
maxConsumersPerTenant: 2,
},
}
for name, tt := range tests {
t.Run(name, func(t *testing.T) {
totalTasksCount := tt.tenantsCount * tt.tasksPerTenant
limits := &mockLimits{maxConsumer: tt.maxConsumersPerTenant}
require.LessOrEqual(t, tt.tasksPerTenant, tenantsQueueMaxSize, "test must be able to enqueue all the tasks without error")
queue := NewRequestQueue(tenantsQueueMaxSize, 0, limits, NewMetrics(nil, constants.Loki, "query_scheduler"))
for i := 0; i < tt.consumersCount; i++ {
queue.RegisterConsumerConnection(createConsumerName(i))
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
wg, _ := errgroup.WithContext(ctx)
mtx := &sync.Mutex{}
receivedTasksPerTenant := make(map[string]int, tt.tenantsCount)
receivedTasksCount := atomic.NewInt32(0)
for i := 0; i < tt.consumersCount; i++ {
consumer := createConsumerName(i)
wg.Go(func() error {
idx := StartIndexWithLocalQueue
for {
tasks, newIdx, err := queue.DequeueMany(ctx, idx, consumer, tt.dequeueBatchSize)
if err != nil && !errors.Is(err, context.Canceled) {
return fmt.Errorf("error while dequeueing many task by %s: %w", consumer, err)
}
if err == nil && len(tasks) == 0 {
return fmt.Errorf("%s must receive at least one item if there is no error, but got 0", consumer)
}
if len(tasks) > 1 && !isAllItemsSame(tasks) {
return fmt.Errorf("expected all items to be from the same tenant, but they were not: %v", tasks)
}
time.Sleep(tt.consumerDelay)
collectTasks(mtx, err, tasks, receivedTasksPerTenant)
receivedTasksTotal := receivedTasksCount.Add(int32(len(tasks)))
// put the slice back to the pool
queue.ReleaseRequests(tasks)
if receivedTasksTotal == int32(totalTasksCount) {
//cancel the context to release the rest of the consumers once we received all the tasks from all the queues
cancel()
return nil
}
idx = newIdx
}
})
}
enqueueTasksAsync(tt.tenantsCount, tt.tasksPerTenant, tt.senderDelay, wg, queue)
require.NoError(t, wg.Wait())
require.Len(t, receivedTasksPerTenant, tt.tenantsCount)
for tenant, actualTasksCount := range receivedTasksPerTenant {
require.Equalf(t, tt.tasksPerTenant, actualTasksCount, "%s received less tasks than expected", tenant)
}
})
}
}
func enqueueTasksAsync(tenantsCount int, tasksPerTenant int, senderDelay time.Duration, wg *errgroup.Group, queue *RequestQueue) {
for i := 0; i < tenantsCount; i++ {
tenant := fmt.Sprintf("tenant-%d", i)
wg.Go(func() error {
for j := 0; j < tasksPerTenant; j++ {
err := queue.Enqueue(tenant, []string{}, tenant, nil)
if err != nil {
return fmt.Errorf("error while enqueueing task for the %s : %w", tenant, err)
}
time.Sleep(senderDelay)
}
return nil
})
}
}
func collectTasks(mtx *sync.Mutex, err error, tasks []Request, receivedTasksPerTenant map[string]int) {
if err != nil {
return
}
mtx.Lock()
defer mtx.Unlock()
//tenant name is sent as task
tenant := tasks[0]
receivedTasksPerTenant[fmt.Sprintf("%v", tenant)] += len(tasks)
}
func createConsumerName(i int) string {
return fmt.Sprintf("consumer-%d", i)
}
func isAllItemsSame[T comparable](items []T) bool {
firstItem := items[0]
for i := 1; i < len(items); i++ {
if items[i] != firstItem {
return false
}
}
return true
}
func assertChanReceived(t *testing.T, c chan struct{}, timeout time.Duration, msg string) {
t.Helper()
select {
case <-c:
case <-time.After(timeout):
t.Fatalf(msg)
}
}
func assertChanNotReceived(t *testing.T, c chan struct{}, wait time.Duration, msg string, args ...interface{}) {
t.Helper()
select {
case <-c:
t.Fatalf(msg, args...)
case <-time.After(wait):
// OK!
}
}